Binary Model - 1 = Good, 0 = Bad

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

Data Preprocessing¶

In [2]:
# Function to create dataframe from JSON file
def create_dataframe_from_json(filename):
    data = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data)

review_df = create_dataframe_from_json('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/yelp_dataset/yelp_academic_dataset_review.json')
business_df = create_dataframe_from_json('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/yelp_dataset/yelp_academic_dataset_business.json')
In [17]:
# Merge the two dataframes on the 'business_id' column
merged_df = pd.merge(review_df, business_df, on='business_id', how='left')

# Check the resulting dataframe
print(merged_df.head(10))
                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   
5  JrIxlS1TzJ-iCu79ul40cQ  eUta8W_HdHMXPzLBBZhL1A  04UD14gamNjLY0IDYVhHJg   
6  6AxgBCNX_PNTOxmbRSwcKQ  r3zeYsv1XFBRA4dJpL78cw  gmjsEdUsKpj9Xxu6pdjH0g   
7  _ZeMknuYdlQcUqng_Im3yg  yfFzsLmaWF2d4Sr0UNbBgg  LHSTtnW3YHCeUkRDGyJOyw   
8  ZKvDG2sBvHVdF5oBNUOpAQ  wSTuiTk-sKNdcFyprzZAjg  B5XSoSG3SfvQGtKEGQ1tSQ   
9  pUycOfUwM8vqX7KjRRhUEA  59MxRhNVhU9MYndMkz0wtw  gebiRewfieSdtt17PTW6Zg   

   stars_x  useful  funny  cool  \
0      3.0       0      0     0   
1      5.0       1      0     1   
2      3.0       0      0     0   
3      5.0       1      0     1   
4      4.0       1      0     1   
5      1.0       1      2     1   
6      5.0       0      2     0   
7      5.0       2      0     0   
8      3.0       1      1     0   
9      3.0       0      0     0   

                                                text                 date  \
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11   
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18   
2  Family diner. Had the buffet. Eclectic assortm...  2014-02-05 20:30:30   
3  Wow!  Yummy, different,  delicious.   Our favo...  2015-01-04 00:01:03   
4  Cute interior and owner (?) gave us tour of up...  2017-01-14 20:54:15   
5  I am a long term frequent customer of this est...  2015-09-23 23:10:31   
6  Loved this tour! I grabbed a groupon and the p...  2015-01-03 23:21:18   
7  Amazingly amazing wings and homemade bleu chee...  2015-08-07 02:29:16   
8  This easter instead of going to Lopez Lake we ...  2016-03-30 22:46:33   
9  Had a party of 6 here for hibachi. Our waitres...  2016-07-25 07:31:06   

                              name  ... state postal_code   latitude  \
0     Turning Point of North Wales  ...    PA       19454  40.210196   
1       Body Cycle Spinning Studio  ...    PA       19119  39.952103   
2                Kettle Restaurant  ...    AZ       85713  32.207233   
3                            Zaika  ...    PA       19114  40.079848   
4                             Melt  ...    LA       70119  29.962102   
5                         Dmitri's  ...    PA       19147  39.938013   
6       The Voodoo Bone Lady Tours  ...    LA       70170  29.952030   
7                  Fries Rebellion  ...    PA       18951  40.407537   
8       Los Padres National Forest  ...    CA       93105  34.597239   
9  Hibachi Steak House & Sushi Bar  ...    CA       93101  34.416984   

    longitude  stars_y  review_count  is_open  \
0  -75.223639      3.0           169        1   
1  -75.172753      5.0           144        0   
2 -110.980864      3.5            47        1   
3  -75.025080      4.0           181        1   
4  -90.087958      4.0            32        0   
5  -75.148131      4.0           273        0   
6  -90.070334      4.5           359        1   
7  -75.338825      3.5           103        0   
8 -119.510772      4.5            13        1   
9 -119.695556      3.5           488        1   

                                          attributes  \
0  {'NoiseLevel': 'u'average'', 'HasTV': 'False',...   
1  {'BusinessAcceptsCreditCards': 'True', 'GoodFo...   
2  {'RestaurantsReservations': 'True', 'BusinessP...   
3  {'Caters': 'True', 'Ambience': '{'romantic': F...   
4  {'BusinessParking': '{'garage': False, 'street...   
5  {'BusinessParking': '{'garage': False, 'street...   
6                            {'GoodForKids': 'True'}   
7  {'RestaurantsAttire': ''casual'', 'Ambience': ...   
8  {'GoodForKids': 'True', 'BikeParking': 'True',...   
9  {'Corkage': 'False', 'RestaurantsTakeOut': 'Tr...   

                                          categories  \
0  Restaurants, Breakfast & Brunch, Food, Juice B...   
1  Active Life, Cycling Classes, Trainers, Gyms, ...   
2                    Restaurants, Breakfast & Brunch   
3              Halal, Pakistani, Restaurants, Indian   
4  Sandwiches, Beer, Wine & Spirits, Bars, Food, ...   
5         Mediterranean, Restaurants, Seafood, Greek   
6  Supernatural Readings, Tours, Hotels & Travel,...   
7  Beer Bar, Bars, American (New), Gastropubs, Re...   
8                                 Parks, Active Life   
9     Steakhouses, Sushi Bars, Restaurants, Japanese   

                                               hours  
0  {'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'...  
1  {'Monday': '6:30-20:30', 'Tuesday': '6:30-20:3...  
2                                               None  
3  {'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21...  
4  {'Monday': '0:0-0:0', 'Friday': '11:0-17:0', '...  
5  {'Wednesday': '17:30-21:0', 'Thursday': '17:30...  
6  {'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...  
7  {'Wednesday': '11:0-23:0', 'Thursday': '11:0-2...  
8                                               None  
9                              {'Monday': '0:0-0:0'}  

[10 rows x 22 columns]
In [18]:
merged_df.shape
Out[18]:
(6990280, 22)
In [19]:
merged_df.dtypes
Out[19]:
review_id        object
user_id          object
business_id      object
stars_x         float64
useful            int64
funny             int64
cool              int64
text             object
date             object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars_y         float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object
In [20]:
merged_df.head(10)
Out[20]:
review_id user_id business_id stars_x useful funny cool text date name ... state postal_code latitude longitude stars_y review_count is_open attributes categories hours
0 KU_O5udG6zpxOg-VcAEodg mh_-eMZ6K5RLWhZyISBhwA XQfwVwDr-v0ZS3_CbbE5Xw 3.0 0 0 0 If you decide to eat here, just be aware it is... 2018-07-07 22:09:11 Turning Point of North Wales ... PA 19454 40.210196 -75.223639 3.0 169 1 {'NoiseLevel': 'u'average'', 'HasTV': 'False',... Restaurants, Breakfast & Brunch, Food, Juice B... {'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'...
1 BiTunyQ73aT9WBnpR9DZGw OyoGAe7OKpv6SyGZT5g77Q 7ATYjTIgM3jUlt4UM3IypQ 5.0 1 0 1 I've taken a lot of spin classes over the year... 2012-01-03 15:28:18 Body Cycle Spinning Studio ... PA 19119 39.952103 -75.172753 5.0 144 0 {'BusinessAcceptsCreditCards': 'True', 'GoodFo... Active Life, Cycling Classes, Trainers, Gyms, ... {'Monday': '6:30-20:30', 'Tuesday': '6:30-20:3...
2 saUsX_uimxRlCVr67Z4Jig 8g_iMtfSiwikVnbP2etR0A YjUWPpI6HXG530lwP-fb2A 3.0 0 0 0 Family diner. Had the buffet. Eclectic assortm... 2014-02-05 20:30:30 Kettle Restaurant ... AZ 85713 32.207233 -110.980864 3.5 47 1 {'RestaurantsReservations': 'True', 'BusinessP... Restaurants, Breakfast & Brunch None
3 AqPFMleE6RsU23_auESxiA _7bHUi9Uuf5__HHc_Q8guQ kxX2SOes4o-D3ZQBkiMRfA 5.0 1 0 1 Wow! Yummy, different, delicious. Our favo... 2015-01-04 00:01:03 Zaika ... PA 19114 40.079848 -75.025080 4.0 181 1 {'Caters': 'True', 'Ambience': '{'romantic': F... Halal, Pakistani, Restaurants, Indian {'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21...
4 Sx8TMOWLNuJBWer-0pcmoA bcjbaE6dDog4jkNY91ncLQ e4Vwtrqf-wpJfwesgvdgxQ 4.0 1 0 1 Cute interior and owner (?) gave us tour of up... 2017-01-14 20:54:15 Melt ... LA 70119 29.962102 -90.087958 4.0 32 0 {'BusinessParking': '{'garage': False, 'street... Sandwiches, Beer, Wine & Spirits, Bars, Food, ... {'Monday': '0:0-0:0', 'Friday': '11:0-17:0', '...
5 JrIxlS1TzJ-iCu79ul40cQ eUta8W_HdHMXPzLBBZhL1A 04UD14gamNjLY0IDYVhHJg 1.0 1 2 1 I am a long term frequent customer of this est... 2015-09-23 23:10:31 Dmitri's ... PA 19147 39.938013 -75.148131 4.0 273 0 {'BusinessParking': '{'garage': False, 'street... Mediterranean, Restaurants, Seafood, Greek {'Wednesday': '17:30-21:0', 'Thursday': '17:30...
6 6AxgBCNX_PNTOxmbRSwcKQ r3zeYsv1XFBRA4dJpL78cw gmjsEdUsKpj9Xxu6pdjH0g 5.0 0 2 0 Loved this tour! I grabbed a groupon and the p... 2015-01-03 23:21:18 The Voodoo Bone Lady Tours ... LA 70170 29.952030 -90.070334 4.5 359 1 {'GoodForKids': 'True'} Supernatural Readings, Tours, Hotels & Travel,... {'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...
7 _ZeMknuYdlQcUqng_Im3yg yfFzsLmaWF2d4Sr0UNbBgg LHSTtnW3YHCeUkRDGyJOyw 5.0 2 0 0 Amazingly amazing wings and homemade bleu chee... 2015-08-07 02:29:16 Fries Rebellion ... PA 18951 40.407537 -75.338825 3.5 103 0 {'RestaurantsAttire': ''casual'', 'Ambience': ... Beer Bar, Bars, American (New), Gastropubs, Re... {'Wednesday': '11:0-23:0', 'Thursday': '11:0-2...
8 ZKvDG2sBvHVdF5oBNUOpAQ wSTuiTk-sKNdcFyprzZAjg B5XSoSG3SfvQGtKEGQ1tSQ 3.0 1 1 0 This easter instead of going to Lopez Lake we ... 2016-03-30 22:46:33 Los Padres National Forest ... CA 93105 34.597239 -119.510772 4.5 13 1 {'GoodForKids': 'True', 'BikeParking': 'True',... Parks, Active Life None
9 pUycOfUwM8vqX7KjRRhUEA 59MxRhNVhU9MYndMkz0wtw gebiRewfieSdtt17PTW6Zg 3.0 0 0 0 Had a party of 6 here for hibachi. Our waitres... 2016-07-25 07:31:06 Hibachi Steak House & Sushi Bar ... CA 93101 34.416984 -119.695556 3.5 488 1 {'Corkage': 'False', 'RestaurantsTakeOut': 'Tr... Steakhouses, Sushi Bars, Restaurants, Japanese {'Monday': '0:0-0:0'}

10 rows × 22 columns

In [21]:
# Drop unnecessary features
df = merged_df[['text', 'stars_x', 'categories','state','name']]
In [22]:
# Check the resulting dataframe
df.head(10)
Out[22]:
text stars_x categories state name
0 If you decide to eat here, just be aware it is... 3.0 Restaurants, Breakfast & Brunch, Food, Juice B... PA Turning Point of North Wales
1 I've taken a lot of spin classes over the year... 5.0 Active Life, Cycling Classes, Trainers, Gyms, ... PA Body Cycle Spinning Studio
2 Family diner. Had the buffet. Eclectic assortm... 3.0 Restaurants, Breakfast & Brunch AZ Kettle Restaurant
3 Wow! Yummy, different, delicious. Our favo... 5.0 Halal, Pakistani, Restaurants, Indian PA Zaika
4 Cute interior and owner (?) gave us tour of up... 4.0 Sandwiches, Beer, Wine & Spirits, Bars, Food, ... LA Melt
5 I am a long term frequent customer of this est... 1.0 Mediterranean, Restaurants, Seafood, Greek PA Dmitri's
6 Loved this tour! I grabbed a groupon and the p... 5.0 Supernatural Readings, Tours, Hotels & Travel,... LA The Voodoo Bone Lady Tours
7 Amazingly amazing wings and homemade bleu chee... 5.0 Beer Bar, Bars, American (New), Gastropubs, Re... PA Fries Rebellion
8 This easter instead of going to Lopez Lake we ... 3.0 Parks, Active Life CA Los Padres National Forest
9 Had a party of 6 here for hibachi. Our waitres... 3.0 Steakhouses, Sushi Bars, Restaurants, Japanese CA Hibachi Steak House & Sushi Bar
In [23]:
# Display the number of records with missing values
missing_values_count = df.isnull().sum()
print("Number of records with missing values:")
print(missing_values_count)

# Display the number of duplicate records
duplicate_records_count = df.duplicated().sum()
print("\nNumber of duplicate records:")
print(duplicate_records_count)

# Display the number of records before any cleaning
total_records_before_cleaning = len(df)
print("\nTotal number of records before cleaning:", total_records_before_cleaning)
Number of records with missing values:
text            0
stars_x         0
categories    689
state           0
name            0
dtype: int64

Number of duplicate records:
12007

Total number of records before cleaning: 6990280
In [24]:
# Find duplicate records
duplicate_mask = df.duplicated(keep=False)
duplicate_records = df[duplicate_mask]
In [25]:
duplicate_records.head(20)
Out[25]:
text stars_x categories state name
749 I've always had good experiences here. The foo... 4.0 Restaurants, American (New) AZ Nox Kitchen + Cocktails
1283 Great food... Love the Fish tacos & Nachos are... 5.0 Seafood, American (Traditional), Music Venues,... FL Sam's Beach Bar
2244 We were back last night for the second time: a... 5.0 Restaurants, Nightlife, Japanese, Bars CA Yoichi's
2855 I ate like a Queen! A breakfeast Queen- and I... 5.0 Restaurants, Southern, Cajun/Creole, Seafood, ... LA Mena's Palace
3311 We are visiting in Philadelphia from Californi... 5.0 Active Life, Beer Gardens, Grocery, Middle Eas... PA Suraya
3572 While I was moving in and had all my stuff on ... 1.0 Home Services, Real Estate, Apartments FL Egret's Landing Apartments
4094 I have no idea what these people are talking a... 3.0 Restaurants, New Mexican Cuisine, Mexican LA Burritos Grill Mexican Fresh Cuisine
4391 This Wendy's is a good location. Open 24 hours... 4.0 Hot Dogs, American (New), Food, Burgers, Fast ... NJ Wendy's
4673 The person next door reported a leak coming fr... 5.0 Home Services, Plumbing FL Tom Shell Plumbing
4873 The wash was very average and very little atte... 3.0 Auto Detailing, Car Wash, Automotive FL Mike's Auto Detailing
5478 Awesome food...everything was fresh and made t... 5.0 Japanese, Korean, Asian Fusion, Tacos, Mexican... TN Soy Bistro
5542 Stopped in here with my wife and a friend for ... 1.0 Fast Food, Event Planning & Services, Restaura... IN QDOBA Mexican Eats
5960 We went with the simple choice and were not di... 5.0 Restaurants, Sandwiches, Food, Breakfast & Bru... TN The Pancake Pantry
6006 love this place! The owners are 3 brothers rig... 5.0 Pizza, Restaurants PA Spatola's Pizza
6306 Beware of smiling Joe! We bought living room f... 1.0 Home & Garden, Home Decor, Furniture Stores, S... FL Ashley HomeStore
6424 I brought my cousin here on Christmas night. T... 1.0 Bars, Nightlife, American (Traditional), Music... NV Brew Brothers
6689 We walked into this bar last night - December ... 5.0 Restaurants, Tiki Bars, Mexican, Bars, Cocktai... LA Tiki Tolteca
7001 Fabulous!!! My family orders from here all the... 5.0 Seafood, Pizza, Restaurants, Italian PA Main Street Pizzeria & Grille
7021 Diner food does not get any better than this!!... 5.0 Restaurants, Diners, Breakfast & Brunch FL Pop N Sons Diner
7522 Every time I go there the service is terrible ... 1.0 Barbeque, Restaurants, American (Traditional),... LA Dickey's Barbecue Pit

After identifying that the duplicated records appear to be regular or normal entries, we concluded that their presence is likely the result of an error during data handling or uploading. Given that they do not provide any additional information, keeping them in the dataset is unlikely to have a positive impact on model performance. Therefore, we plan to remove these duplicate records as part of the data cleaning process.

In [26]:
# Drop rows with missing values
df.dropna(inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Display the number of records after cleaning
total_records_after_cleaning = len(df)
print("\nTotal number of records after cleaning:", total_records_after_cleaning)
C:\Users\ndhu2\AppData\Local\Temp\ipykernel_20132\611006536.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
Total number of records after cleaning: 6977585
C:\Users\ndhu2\AppData\Local\Temp\ipykernel_20132\611006536.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)

Exploratory Data Analysis¶

In [27]:
df.describe(include='all')
Out[27]:
text stars_x categories state name
count 6977585 6.977585e+06 6977585 6977585 6977585
unique 6973440 NaN 83160 27 114026
top DO NOT PARK HERE!\nthey are too quick to boot ... NaN Mexican, Restaurants PA Starbucks
freq 18 NaN 54841 1596380 21532
mean NaN 3.748519e+00 NaN NaN NaN
std NaN 1.478515e+00 NaN NaN NaN
min NaN 1.000000e+00 NaN NaN NaN
25% NaN 3.000000e+00 NaN NaN NaN
50% NaN 4.000000e+00 NaN NaN NaN
75% NaN 5.000000e+00 NaN NaN NaN
max NaN 5.000000e+00 NaN NaN NaN
In [28]:
# Stars Distribution Analysis
df['stars_x'].hist(bins=20, figsize=(8, 6))
plt.title('Distribution of Star Ratings')
plt.xlabel('Star Rating')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image

As the dataset exhibits an uneven distribution of star ratings, developing a model to predict or identify the sentiment of reviews poses a challenge. The significant difference between the majority class (5 stars) and the minority class (2 stars) may introduce bias during model training. To mitigate this issue, we plan to implement undersampling after splitting the data into training and test datasets, leveraging the abundance of available data.

In [29]:
# Save the DataFrame as a CSV file
df.to_csv('yelp_data.csv', index=False)
In [65]:
new_df = pd.read_csv('yelp_data.csv')
In [66]:
new_df.head()
Out[66]:
text stars_x categories state name
0 If you decide to eat here, just be aware it is... 3.0 Restaurants, Breakfast & Brunch, Food, Juice B... PA Turning Point of North Wales
1 I've taken a lot of spin classes over the year... 5.0 Active Life, Cycling Classes, Trainers, Gyms, ... PA Body Cycle Spinning Studio
2 Family diner. Had the buffet. Eclectic assortm... 3.0 Restaurants, Breakfast & Brunch AZ Kettle Restaurant
3 Wow! Yummy, different, delicious. Our favo... 5.0 Halal, Pakistani, Restaurants, Indian PA Zaika
4 Cute interior and owner (?) gave us tour of up... 4.0 Sandwiches, Beer, Wine & Spirits, Bars, Food, ... LA Melt
In [67]:
new_df.shape
Out[67]:
(6977585, 5)
In [68]:
new_df.dtypes
Out[68]:
text           object
stars_x       float64
categories     object
state          object
name           object
dtype: object
In [69]:
# Create a new feature containing the length of the reviews
new_df['review_length'] = new_df['text'].apply(len)
In [70]:
new_df.head()
Out[70]:
text stars_x categories state name review_length
0 If you decide to eat here, just be aware it is... 3.0 Restaurants, Breakfast & Brunch, Food, Juice B... PA Turning Point of North Wales 513
1 I've taken a lot of spin classes over the year... 5.0 Active Life, Cycling Classes, Trainers, Gyms, ... PA Body Cycle Spinning Studio 829
2 Family diner. Had the buffet. Eclectic assortm... 3.0 Restaurants, Breakfast & Brunch AZ Kettle Restaurant 339
3 Wow! Yummy, different, delicious. Our favo... 5.0 Halal, Pakistani, Restaurants, Indian PA Zaika 243
4 Cute interior and owner (?) gave us tour of up... 4.0 Sandwiches, Beer, Wine & Spirits, Bars, Food, ... LA Melt 534

Text Preprocessing¶

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ndhu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ndhu2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ndhu2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[2]:
True

We decided to handle contractions in our dataset to reduce computational power requirements by reducing the dimensionality of the data and unify the impact of previously contracted words.

In [31]:
import re

# Dictionary of English contractions
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}
In [32]:
# Function to expand contractions
def expand_contractions(text, contractions_dict):
    # Regular expression for finding contractions
    contractions_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b', flags=re.IGNORECASE)
    
    # Preprocess the keys of contractions_dict to lowercase
    processed_dict = {key.lower(): value for key, value in contractions_dict.items()}
    
    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = processed_dict.get(match.lower(), match)
        return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text
In [87]:
# Previous text preprocessisng
"""import unicodedata

# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Normalize text (Remove accents, diacritics, etc.)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Expand contractions
    text = expand_contractions(text, contractions_dict)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply text preprocessing to the 'text' column
new_df['preprocessed_text'] = new_df['text'].apply(preprocess_text)
"""
In [33]:
import re
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string




# Function for text preprocessing
def preprocess_text(text):

    # Initialize WordNet Lemmatizer and stopwords
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase
    text = text.lower()

    # Remove URLs using regular expression
    text = re.sub(r'http\S+', '', text)

    # Normalize text (Remove accents, diacritics, etc.)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Expand contractions
    text = expand_contractions(text, contractions_dict)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Initialize a variable to keep track of whether the current token follows a negation
    following_negation = False
    
    # Iterate over tokens and modify sentiment of words that follow negations
    for i in range(len(tokens)):
        token = tokens[i]
        if token in ["not", "no"]:
            following_negation = True
        elif following_negation:
            # Modify sentiment of the word
            tokens[i] = "not_" + tokens[i]
            following_negation = False
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Remove non-alphanumeric characters except for words starting with 'not_'
    tokens = [token for token in tokens if token.isalnum() or token.startswith('not_')]
    
    # Whitespace removal
    tokens = [token.strip() for token in tokens if token is not None]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text
In [34]:
# Test text containing "didn't"
test_text = "I didn't like it. It wasn't good at all. But at the end, it wasn't that bad"

# Preprocess the test text
preprocessed_test_text = preprocess_text(test_text)

# Display the preprocessed test text
print("Preprocessed test text:", preprocessed_test_text)
Preprocessed test text: not_like not_good end not_that bad
In [76]:
# Apply text preprocessing to the 'text' column
new_df['preprocessed_text'] = new_df['text'].apply(preprocess_text)
In [86]:
"""import unicodedata

# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Expand contractions
    text = expand_contractions(text, contractions_dict)

    # Normalize text (Remove accents, diacritics, etc.)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply text preprocessing to the 'text' column
sample_df = pd.DataFrame()
# Take the first 20 records of the 'text' column from new_df
texts_to_process = new_df['text'].iloc[:20]

# Apply text preprocessing to the selected texts and assign it to the new column 'preprocessed_text'
sample_df['preprocessed_text'] = texts_to_process.apply(preprocess_text)

# Display the resulting DataFrame
print(sample_df)"""
                                    preprocessed_text
0   decide eat aware going take 2 hour beginning e...
1   I taken lot spin class year nothing compare cl...
2   family diner buffet eclectic assortment large ...
3   wow yummy different delicious favorite lamb cu...
4   cute interior owner gave u tour upcoming patio...
5   long term frequent customer establishment went...
6   loved tour grabbed groupon price great perfect...
7   amazingly amazing wing homemade bleu cheese ri...
8   easter instead going lopez lake went los padre...
9   party 6 hibachi waitress brought separate sush...
10  experience shalimar nothing wonderful wanted g...
11  local recommended milktooth amazing jewel indi...
12  love going happy hour dinner great patio fan b...
13  good food -- loved gnocchi marinara baked eggp...
14  bun make sonoran dog like snuggie pup first se...
15  great place breakfast waffle fluffy perfect ho...
16  tremendous service big shout douglas complemen...
17  hubby multiple occasion loved every part meal ...
18  go blow bar get brow done natalie brow special...
19  absolute favorite cafe city black white latte ...
In [ ]:
"""# Initialize WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove URLs using regular expression
    text = re.sub(r'http\S+', '', text)
    
    # Remove non-alphanumeric characters, but keep numbers
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply text preprocessing to the 'text' column
new_df['preprocessed_text'] = new_df['text'].apply(preprocess_text)"""

We decided to implement Lemmatization instead of Stemming as this one tends to give more accurate linguistic results, which can lead to a better model performance.

Data Engineering¶

In [77]:
new_df.head(50)
Out[77]:
text stars_x categories state name review_length preprocessed_text
0 If you decide to eat here, just be aware it is... 3.0 Restaurants, Breakfast & Brunch, Food, Juice B... PA Turning Point of North Wales 513 decide eat aware going take 2 hour beginning e...
1 I've taken a lot of spin classes over the year... 5.0 Active Life, Cycling Classes, Trainers, Gyms, ... PA Body Cycle Spinning Studio 829 I taken lot spin class year nothing compare cl...
2 Family diner. Had the buffet. Eclectic assortm... 3.0 Restaurants, Breakfast & Brunch AZ Kettle Restaurant 339 family diner buffet eclectic assortment large ...
3 Wow! Yummy, different, delicious. Our favo... 5.0 Halal, Pakistani, Restaurants, Indian PA Zaika 243 wow yummy different delicious favorite lamb cu...
4 Cute interior and owner (?) gave us tour of up... 4.0 Sandwiches, Beer, Wine & Spirits, Bars, Food, ... LA Melt 534 cute interior owner gave u tour upcoming area ...
5 I am a long term frequent customer of this est... 1.0 Mediterranean, Restaurants, Seafood, Greek PA Dmitri's 341 long term frequent customer establishment went...
6 Loved this tour! I grabbed a groupon and the p... 5.0 Supernatural Readings, Tours, Hotels & Travel,... LA The Voodoo Bone Lady Tours 804 loved tour grabbed groupon price great perfect...
7 Amazingly amazing wings and homemade bleu chee... 5.0 Beer Bar, Bars, American (New), Gastropubs, Re... PA Fries Rebellion 192 amazingly amazing wing homemade bleu cheese ri...
8 This easter instead of going to Lopez Lake we ... 3.0 Parks, Active Life CA Los Padres National Forest 526 easter instead going lopez lake went los padre...
9 Had a party of 6 here for hibachi. Our waitres... 3.0 Steakhouses, Sushi Bars, Restaurants, Japanese CA Hibachi Steak House & Sushi Bar 524 party 6 hibachi waitress brought separate sush...
10 My experience with Shalimar was nothing but wo... 5.0 Shopping, Jewelry FL Shalimar Fine Jewelers 1009 experience shalimar nothing wonderful wanted g...
11 Locals recommended Milktooth, and it's an amaz... 4.0 Beer, Wine & Spirits, Cafes, Coffee & Tea, Res... IN Milktooth 119 local recommended milktooth amazing jewel indi...
12 Love going here for happy hour or dinner! Gre... 4.0 Bars, Pizza, Nightlife, Cocktail Bars, Italian... MO Brio Italian Grille 242 love going happy hour dinner great patio fan b...
13 Good food--loved the gnocchi with marinara\nth... 4.0 Pizza, Restaurants, Italian, Salad PA LaScala's 175 good food loved gnocchi marinara baked eggplan...
14 The bun makes the Sonoran Dog. It's like a snu... 4.0 Restaurants, Tacos, Mexican, Hot Dogs, Breakfa... AZ BK Tacos 658 bun make sonoran dog like snuggie pup first se...
15 Great place for breakfast! I had the waffle, w... 5.0 Sandwiches, Restaurants, American (New), Ameri... FL Mamas Kitchen 175 great place breakfast waffle fluffy perfect ho...
16 Tremendous service (Big shout out to Douglas) ... 5.0 Wine Bars, Restaurants, Nightlife, Steakhouses... PA Rittenhouse Grill 276 tremendous service big shout douglas complemen...
17 The hubby and I have been here on multiple occ... 4.0 Wine Bars, Bars, Nightlife, American (New), Me... MO Olio 577 hubby multiple occasion loved every part meal ...
18 I go to blow bar to get my brows done by natal... 5.0 Makeup Artists, Blow Dry/Out Services, Beauty ... FL Blow Bar Express Styling Salon 393 go blow bar get brow done natalie brow special...
19 My absolute favorite cafe in the city. Their b... 5.0 Food, Cafes, Coffee & Tea, Restaurants PA Good Karma Cafe 419 absolute favorite cafe city black white latte ...
20 HOLY SMOKES!\n\nactual pumpkin pie mixed in wi... 5.0 Ice Cream & Frozen Yogurt, Food, Local Flavor,... MO Ted Drewes 249 holy smoke actual pumpkin pie mixed frozen cus...
21 Upland is a brewery based out of Bloomington, ... 3.0 Nightlife, Food, Bars, Breweries, Pizza, Brewp... IN Upland Carmel Tap House 736 upland brewery based bloomington indiana becom...
22 I thoroughly enjoyed the show. Chill way to s... 5.0 Performing Arts, Arts & Entertainment, Nightli... PA The N Crowd 66 thoroughly enjoyed show chill way spend friday...
23 Yes, this is the only sushi place in town. How... 4.0 Restaurants, Sushi Bars CA Sushi Teri 325 yes sushi place town however great craving sus...
24 I was really between 3 and 4 stars for this on... 4.0 Restaurants, Food, Poke, Hawaiian, Sushi Bars IN Naked Tchopstix Express 1555 really 3 4 star one love 96th street naked tch...
25 Went for lunch. Beef brisket sandwich was awes... 4.0 American (New), Restaurants, Cocktail Bars, Ba... IN Barbecue and Bourbon 110 went lunch beef brisket sandwich awesome juicy...
26 Best thai food in the area. Everything was au... 5.0 Thai, Restaurants PA Thai Place Restaurant 110 best thai food area everything authentic delic...
27 Service was crappy, and food was mediocre. I ... 3.0 Cajun/Creole, Seafood, Restaurants, Breakfast ... LA Creole House Restaurant & Oyster Bar 115 service crappy food mediocre wish would picked...
28 I recently had dinner here with my wife over t... 5.0 Event Planning & Services, Italian, Venues & E... PA Anthony's at Paxon Hollow 479 recently dinner wife weekend could not_have pl...
29 I at least have to give this restaurant two st... 2.0 Cocktail Bars, Nightlife, Gastropubs, Sports B... TN Tavern 473 least give restaurant two star due decent food...
30 First time there and it was excellent!!! It fe... 5.0 Restaurants, Seafood, Cafes, Italian PA Portobello Cafe 222 first time excellent feel like entering someon...
31 Great burgers,fries and salad! Burgers have a... 5.0 Fast Food, Burgers, Restaurants CA The Original Habit Burger Grill 209 great burger fry salad burger hint salt pepper...
32 Great staff always helps and always nice. Alwa... 5.0 Food, Coffee & Tea, Gas Stations, Restaurants,... PA Wawa 169 great staff always help always nice always cle...
33 Took my vehicle here for some work a few years... 5.0 Auto Repair, Smog Check Stations, Auto Parts &... NV Landa Muffler & Brake 384 took vehicle work year ago manufacturer recall...
34 After my ROTD yesterday of a different Sweet ... 4.0 Food, Ice Cream & Frozen Yogurt TN Sweet Cece's 490 rotd yesterday different sweet cece location r...
35 What a great addition to the Funk Zone! Grab ... 5.0 Food, Restaurants, Salad, Coffee & Tea, Breakf... CA Helena Avenue Bakery 222 great addition funk zone grab bite grab tastin...
36 Nice relaxing place to get a massage! Same day... 5.0 Health & Medical, Beauty & Spas, Massage, Phys... NV Ralston Massage Center 156 nice relaxing place get massage day appointmen...
37 We checked in around 2:30 pm. Check-in was qu... 4.0 Event Planning & Services, Casinos, Beauty & S... NV Peppermill Reno 1369 checked around pm quick easy complimentary val...
38 My boyfriend and I tried this deli for the fir... 5.0 Restaurants, Delis, Salad, Sandwiches PA The Coventry Deli 393 boyfriend tried deli first time today turkey a...
39 Amazing biscuits and (fill in the blank). Grea... 5.0 American (New), Restaurants, American (Traditi... TN Milk and Honey Nashville 101 amazing biscuit fill blank great cocktail high...
40 Food was good- atmosphere/decor is like a fish... 4.0 Seafood, Steakhouses, Salad, Comfort Food, Res... FL Aquafinz 222 food like fishing menu someplace outback bonef...
41 Straight to the point, it's cheap, it tastes a... 2.0 American (New), Restaurants, Buffets, Breakfas... NV The Buffet 621 straight point cheap taste feel cheap good pri...
42 The only reason I didn't give this restaurant ... 4.0 American (New), Breakfast & Brunch, Bars, Nigh... PA Square 1682 2343 reason not_give restaurant 5 star rating one s...
43 Stopped by after a Sunday morning walk in the ... 5.0 Bagels, Sporting Goods, Outdoor Gear, Coffee &... TN Three Brothers Coffee 112 stopped sunday morning walk park great food co...
44 In a word... "OVERRATED!". The food took fore... 3.0 Bars, Breakfast & Brunch, Restaurants, Barbequ... LA Mr. B's Bistro 363 word overrated food took forever come burger w...
45 Comfortable bed, good breakfast, fast internet... 4.0 Hotels, Event Planning & Services, Caterers, H... PA DoubleTree by Hilton Hotel Philadelphia Center... 233 comfortable bed good breakfast fast internet g...
46 NEVER AGAIN. This is a so called restaurant th... 2.0 Jazz & Blues, Bars, Arts & Entertainment, Beer... LA Bacchanal Fine Wine & Spirits 590 never called restaurant nothing restaurant pre...
47 If you want to pay for everything a la carte t... 1.0 American (New), Restaurants, Mexican FL El Chicanito Mexican Restaurant 1016 want pay everything la carte place food not_te...
48 The cafe was extremely cute. We came at 8am an... 4.0 Sandwiches, Breakfast & Brunch, Cajun/Creole, ... LA Cafe Beignet on Bourbon Street 248 cafe extremely cute came 8am even jazz band pl...
49 On a scale of one to things that are awesome, ... 5.0 Bars, Nightlife, Whiskey Bars, Burgers, Restau... PA Village Whiskey 934 scale one thing awesome place bomb drawn promi...
In [78]:
# Save the DataFrame as a CSV file
new_df.to_csv('yelp_lemmatized_data.csv', index=False)

Model Training¶

In [7]:
# Create dataframe
df = pd.read_csv('yelp_lemmatized_data.csv')
In [8]:
df.dtypes
Out[8]:
text                  object
stars_x              float64
categories            object
state                 object
name                  object
review_length          int64
preprocessed_text     object
dtype: object
In [26]:
# EDA

# Distribution of Review Lengths by Sentiment considering a uniform distribution

import matplotlib.pyplot as plt
import seaborn as sns

# Filter positive and negative sentiment reviews
positive_reviews = df[df['stars_x'] >= 4]['review_length']
negative_reviews = df[df['stars_x'] <= 2]['review_length']

# Set up the figure and axes
plt.figure(figsize=(10, 6))

# Plot histogram for positive sentiment reviews
sns.histplot(positive_reviews, color='green', kde=True, label='Positive Sentiment')

# Plot histogram for negative sentiment reviews
sns.histplot(negative_reviews, color='red', kde=True, label='Negative Sentiment')

# Add labels and title
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths by Sentiment')
plt.legend()

# Show plot
plt.show()
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [27]:
# Distribution of Review Lengths by Sentiment considering the real distribution

import matplotlib.pyplot as plt
import seaborn as sns

# Filter positive and negative sentiment reviews
positive_reviews = df[df['stars_x'] >= 4]['review_length']
negative_reviews = df[df['stars_x'] <= 2]['review_length']

# Set up the figure and axes
plt.figure(figsize=(10, 6))

# Plot histogram for positive sentiment reviews
sns.histplot(positive_reviews, color='green', kde=True, label='Positive Sentiment')

# Plot histogram for negative sentiment reviews
sns.histplot(negative_reviews, color='red', kde=True, label='Negative Sentiment')

# Add labels and title
plt.xlabel('Review Length')
plt.ylabel('Frequency')
plt.title('Distribution of Review Lengths by Sentiment')
plt.legend()

# Show plot
plt.show()
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
d:\Users\ndhu2\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [28]:
# Distribution of Review Lengths by Sentiment considering a uniform distribution

# Set up the figure and axes
plt.figure(figsize=(10, 6))

# Create a box plot of review lengths by sentiment
sns.boxplot(data=df, x='stars_x', y='review_length')

# Add labels and title
plt.xlabel('Stars')
plt.ylabel('Review Length')
plt.title('Distribution of Review Lengths by Sentiment')

# Show plot
plt.show()
No description has been provided for this image
In [9]:
# Display the number of records with missing values
missing_values_count = df.isnull().sum()
print("Number of records with missing values:")
print(missing_values_count)
Number of records with missing values:
text                   0
stars_x                0
categories             0
state                  0
name                   0
review_length          0
preprocessed_text    238
dtype: int64
In [10]:
# Drop rows with missing values in the 'preprocessed_text' column
df = df.dropna(subset=['preprocessed_text'])

We chose TF-IDF instead of other techniques, such as CountVectorizer, because it not only considers the frequency of the words in just one document but also the frequency of the words across the corpus (all the documents). This can give an additional element to the model, allowing it to learn more from the weight that each word has from a more general perspective.

In [11]:
# Remove the rows with 3-star ratings
df = df[df['stars_x'] != 3]
In [12]:
X = df['preprocessed_text']
y = df['stars_x']
In [76]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [80]:
# Undersampling Process
# Identify the size of the minority class
minority_class_size = y_train.value_counts().min()

# Undersample the majority classes to match the size of the minority class
undersampled_indices = pd.concat([
    y_train[y_train == 1].sample(minority_class_size, replace=False),
    y_train[y_train == 2].sample(minority_class_size, replace=False),
    y_train[y_train == 4].sample(minority_class_size, replace=False),
    y_train[y_train == 5].sample(minority_class_size, replace=False)
]).index

X_train_u = X_train.loc[undersampled_indices]
y_train_u = y_train.loc[undersampled_indices]
In [81]:
# Map ratings to binary labels (1 for good, 0 for bad)
y_train_u = y_train_u.map({1: 0, 2: 0, 4: 1, 5: 1})
y_test = y_test.map({1: 0, 2: 0, 4: 1, 5: 1})
In [85]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

# Define a pipeline with TF-IDF vectorizer and Bernoulli Naive Bayes classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', BernoulliNB())
])

# Define a range of min_df values to search
min_df = [1, 2, 4, 8, 12, 16, 20, 40, 60, 80, 160]

# Define parameters grid for grid search
param_grid = {
    'tfidf__min_df': min_df,
}

# Perform grid search

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_u, y_train_u)

# Get the best min_df value
best_min_df = grid_search.best_params_['tfidf__min_df']
print("Best min_df:", best_min_df)

# Train the model with the best min_df value
best_model = grid_search.best_estimator_
best_model.fit(X_train_u, y_train_u)

# Evaluate the best model
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)
Best min_df: 2
Accuracy: 0.8780249576655061
In [86]:
import matplotlib.pyplot as plt

# Extract grid search results
results = grid_search.cv_results_

# Extract mean test scores and corresponding max_features values
mean_test_scores = results['mean_test_score']
max_features_values = [params['tfidf__min_df'] for params in results['params']]

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(max_features_values, mean_test_scores, marker='o')
plt.title('Mean Test Accuracy vs. Min Document Frequency')
plt.xlabel('Min Document Frequency')
plt.ylabel('Mean Test Accuracy')
plt.xticks(max_features_values)
plt.grid(True)
plt.show()
No description has been provided for this image
In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=2)

# Fit and transform the preprocessed_text feature (X -> TF-IDF Matrix)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_u)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
In [12]:
import pickle

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)
In [13]:
X_train_tfidf.shape
Out[13]:
(1738088, 139023)
In [14]:
from sklearn.naive_bayes import BernoulliNB

# Initialize and train the Bernoulli Naive Bayes classifier
nb_classifier = BernoulliNB()
nb_classifier.fit(X_train_tfidf, y_train_u)
Out[14]:
BernoulliNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BernoulliNB()
In [39]:
# Predict sentiment on the test set
y_test_pred = nb_classifier.predict(X_test_tfidf)

# Predict labels for the training dataset
y_train_pred = nb_classifier.predict(X_train_tfidf)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 2
      1 # Predict sentiment on the test set
----> 2 y_test_pred = nb_classifier.predict(X_test_tfidf)
      4 # Predict labels for the training dataset
      5 y_train_pred = nb_classifier.predict(X_train_tfidf)

NameError: name 'X_test_tfidf' is not defined
In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

# Calculate accuracy on the training dataset
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
Test Accuracy: 0.8831434614090583
Training Accuracy: 0.8330199621653219
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.78      0.77    321703
           1       0.92      0.92      0.92    935548

    accuracy                           0.88   1257251
   macro avg       0.85      0.85      0.85   1257251
weighted avg       0.88      0.88      0.88   1257251

In [17]:
import pickle

# Save the trained model
with open('Model3/nb_bin_model3.pkl', 'wb') as file:
    pickle.dump(nb_classifier, file)
In [47]:
# Load the saved model
with open('Model3/nb_bin_model3.pkl', 'rb') as file:
    loaded_classifier = pickle.load(file)
In [98]:
# Predict labels for the test dataset
y_test_pred = loaded_classifier.predict(X_test_tfidf)

# Calculate accuracy on the test dataset
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", test_accuracy)

# Predict labels for the training dataset
y_train_pred = loaded_classifier.predict(X_train_tfidf)

# Calculate accuracy on the training dataset
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)
Accuracy: 0.8780249576655061
Training Accuracy: 0.8266599408001176
In [99]:
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Sentiment')
plt.ylabel('True Sentiment')
plt.show()
No description has been provided for this image
In [100]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Initialize SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0)  # Linear kernel with regularization parameter C

# Train the SVM model
svm_classifier.fit(X_train_tfidf, y_train_u)
In [ ]:
# Predictions on the test set
y_test_pred = svm_classifier.predict(X_test_tfidf)

# Predictions on the training set
y_train_pred = svm_classifier.predict(X_train_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

# Calculate accuracy on the training dataset
train_accuracy = accuracy_score(y_train_u, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
In [ ]:
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Sentiment')
plt.ylabel('True Sentiment')
plt.show()
In [93]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train_u)

# Convert text data to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_u)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure they have the same length
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)
In [95]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming you have preprocessed your data and have X_train, y_train, X_test, y_test


# Define the embedding layer without weights
embedding_layer = Embedding(input_dim=10000, output_dim=16)

# Define RNN model
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
In [98]:
print("X_train_pad shape:", X_train_pad.shape)
print("y_train shape:", y_train_u.shape)
print("X_test_pad shape:", X_test_pad.shape)
print("y_test shape:", y_test.shape)
X_train_pad shape: (1738088, 100)
y_train shape: (1738088,)
X_test_pad shape: (1257251, 100)
y_test shape: (1257251,)
In [99]:
# Train the model
history = model.fit(X_train_pad, y_train_u, epochs=5, batch_size=128, validation_data=(X_test_pad, y_test))
Epoch 1/5
13579/13579 ━━━━━━━━━━━━━━━━━━━━ 357s 26ms/step - accuracy: 0.9316 - loss: 0.1723 - val_accuracy: 0.9657 - val_loss: 0.0941
Epoch 2/5
13579/13579 ━━━━━━━━━━━━━━━━━━━━ 354s 26ms/step - accuracy: 0.9607 - loss: 0.1055 - val_accuracy: 0.9670 - val_loss: 0.0872
Epoch 3/5
13579/13579 ━━━━━━━━━━━━━━━━━━━━ 362s 27ms/step - accuracy: 0.9652 - loss: 0.0940 - val_accuracy: 0.9709 - val_loss: 0.0798
Epoch 4/5
13579/13579 ━━━━━━━━━━━━━━━━━━━━ 363s 27ms/step - accuracy: 0.9678 - loss: 0.0872 - val_accuracy: 0.9730 - val_loss: 0.0745
Epoch 5/5
13579/13579 ━━━━━━━━━━━━━━━━━━━━ 367s 27ms/step - accuracy: 0.9702 - loss: 0.0817 - val_accuracy: 0.9692 - val_loss: 0.0831
In [101]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy}")

_, train_accuracy = model.evaluate(X_train_pad, y_train_u)
print(f"Training Accuracy: {train_accuracy}")
39290/39290 ━━━━━━━━━━━━━━━━━━━━ 160s 4ms/step - accuracy: 0.9689 - loss: 0.0841
Test Accuracy: 0.9692217111587524
54316/54316 ━━━━━━━━━━━━━━━━━━━━ 239s 4ms/step - accuracy: 0.9793 - loss: 0.0615
Training Accuracy: 0.9735997319221497
In [103]:
# Save the model
model.save("lstm_model1.h5")
WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
In [102]:
# Extract the training and validation accuracy from the history
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

# Plot the training and validation accuracy
plt.plot(train_accuracy, label='Training Accuracy')
plt.plot(val_accuracy, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy over Epochs')
plt.legend()
plt.show()
No description has been provided for this image

Model Interpretation¶

In [65]:
from lime.lime_text import LimeTextExplainer
import pickle

# Load the preprocessed test data
df_test = pd.DataFrame({'preprocessed_text': X_test, 'sentiment': y_test})

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

# Load the trained model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
    nb_classifier = pickle.load(file)

# Define a function to classify text using the trained model
def nb_predict_proba(texts):
    vectors = loaded_tfidf_vectorizer.transform(texts)
    return nb_classifier.predict_proba(vectors)

# Initialize LIME TextExplainer
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Choose a random instance from the test set for explanation
idx = np.random.randint(len(X_test))
text_instance = X_test.iloc[idx]
true_label = y_test.iloc[idx]

# Explain the prediction for the random instance
explanation = explainer.explain_instance(text_instance, nb_predict_proba, num_features=10, top_labels=1)

# Print the explanation
print('Text instance:', text_instance)
print('True label:', true_label)
print('Predicted label:', nb_classifier.predict(loaded_tfidf_vectorizer.transform([text_instance]))[0])

# Get the top features and their weights for the predicted label
top_features = explanation.as_list(label=explanation.top_labels[0])

# Print the top features and their weights
for feature, weight in top_features:
    print(f"{feature}: {weight}")

# Show the explanation in the notebook
explanation.show_in_notebook(text=text_instance)
Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming
True label: 0
Predicted label: 0
poor: 0.21160499439554004
subpar: 0.20413545671917072
not_even: 0.16699528039868544
not_pay: 0.1628039063187502
not_1: 0.16013969728307917
disappointed: 0.11251325186433776
minute: 0.11204368151556912
not_be: 0.11072357467954254
30: 0.10755731748425412
highly: -0.06834064308931294
In [67]:
import numpy as np
from lime.lime_text import LimeTextExplainer
import pickle

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

# Load the trained model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
    nb_classifier = pickle.load(file)

# Define a function to classify text using the trained model
def nb_predict_proba(texts):
    vectors = loaded_tfidf_vectorizer.transform(texts)
    return nb_classifier.predict_proba(vectors)

# Initialize LIME TextExplainer
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Input your review here
text_instance = "friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming"

# Preprocess the text instance
preprocessed_text_instance = preprocess_text(text_instance)

# Explain the prediction for the preprocessed text instance
explanation = explainer.explain_instance(preprocessed_text_instance, nb_predict_proba, num_features=10, top_labels=1)

# Print the explanation
print('Text instance:', text_instance)
print('Text instance:', preprocessed_text_instance)
print('Predicted label:', nb_classifier.predict(loaded_tfidf_vectorizer.transform([text_instance]))[0])

# Get the top features and their weights for the predicted label
top_features = explanation.as_list(label=explanation.top_labels[0])

# Print the top features and their weights
for feature, weight in top_features:
    print(f"{feature}: {weight}")

# Show the explanation in the notebook
explanation.show_in_notebook(text=preprocessed_text_instance)
Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got there time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming
Text instance: friend recommended place excited go however poor service well subpar food highly disappointed server busy catering large table really not_pay attention husband ask glass water 4 time whole fish came 30 minute ordered table next u ordered 15 minute u got time brussel sprout greasy not_even crispy reason giving place 2 not_1 star fish duck fat fry not_be coming
Predicted label: 0
poor: 0.22694478715492192
subpar: 0.19242713838680955
not_pay: 0.16451148406906657
not_1: 0.16102168894280453
not_even: 0.1532261083751255
30: 0.13008937176242705
minute: 0.12345846174484312
not_be: 0.11608615751599044
disappointed: 0.11568269403809768
highly: -0.07064456285448852
In [46]:
from lime.lime_text import LimeTextExplainer
import pickle

# Load the preprocessed test data
df_test = pd.DataFrame({'preprocessed_text': X_test, 'sentiment': y_test})

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf_vectorizer = pickle.load(file)

# Load the trained model from the pickle file
with open('C:/Users/ndhu2/Desktop/Term_3/6_Natural Language Processing/Project/Model3/nb_bin_model3.pkl', 'rb') as file:
    nb_classifier = pickle.load(file)

# Define a function to classify text using the trained model
def nb_predict_proba(texts):
    vectors = loaded_tfidf_vectorizer.transform(texts)
    return nb_classifier.predict_proba(vectors)

# Initialize LIME TextExplainer
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Aggregate important words
important_words = {}

# Iterate over at least 50 reviews
for idx in range(50):
    # Choose a random instance from the test set for explanation
    text_instance = X_test.iloc[idx]
    true_label = y_test.iloc[idx]

    # Explain the prediction for the random instance
    explanation = explainer.explain_instance(text_instance, nb_predict_proba, num_features=10, top_labels=1)

    # Get the top features and their weights for the predicted label
    top_features = explanation.as_list(label=explanation.top_labels[0])

    # Aggregate the important words
    for feature, weight in top_features:
        if feature in important_words:
            important_words[feature] += weight
        else:
            important_words[feature] = weight

# Normalize the weights if needed
total_weight = sum(important_words.values())
important_words = {k: v/total_weight for k, v in important_words.items()}

# Sort the important words by weight
important_words = dict(sorted(important_words.items(), key=lambda item: item[1], reverse=True))

# Print or analyze the important words
for word, weight in important_words.items():
    print(f"{word}: {weight}")
phone: 0.0685157773880253
email: 0.03604740988252881
paid: 0.0354131963833722
suck: 0.028149012817759195
called: 0.026233646484694697
excuse: 0.025800836269966848
inattention: 0.02570275023257392
administration: 0.024443757802595858
half: 0.02328105361903188
carpet: 0.02322912205927775
attitude: 0.02288640903290162
hallway: 0.022868995177190158
industry: 0.022521533251605354
never: 0.022193439642317035
soggy: 0.021641088610639597
experienced: 0.020981899709956436
said: 0.020795231916628985
min: 0.02021370604593308
disappointed: 0.018554123200798717
15: 0.018407337816177267
dropped: 0.017516243771648822
disrespectful: 0.017471681238841318
literally: 0.016798948665660264
hostess: 0.016682130974701122
changed: 0.015885289535455926
expiration: 0.015645001202966885
arrive: 0.014142613752375794
rude: 0.013617695173746908
could: 0.013599139839455227
disinfectant: 0.013538066041098381
inconsiderate: 0.012874705695045592
wipe: 0.012768224522495309
money: 0.012762778171706603
walmart: 0.012715772033434866
say: 0.012309599002702407
call: 0.012223168288785273
proceeded: 0.011586689930983187
ruined: 0.011339292251219388
100: 0.011232416821351609
eh: 0.01105863328833015
car: 0.011048374753406661
window: 0.01076026750340935
guess: 0.01053730123183656
left: 0.010453075269575136
hour: 0.010310874433751149
fail: 0.01024505593266344
comment: 0.010226298750247166
would: 0.010080674179301054
enjoyed: 0.010034445094516127
unfortunately: 0.009936573450198907
flu: 0.009790051233331617
refunded: 0.009575809216777682
fault: 0.009408671692091622
rant: 0.009350390961346523
stated: 0.00917779480840085
delicious: 0.008973338096866155
charge: 0.008683797165814074
stolen: 0.008674868248512423
fishy: 0.008463277850371867
going: 0.008436414824917568
response: 0.008344126541345313
extensive: 0.00819102705907891
perfect: 0.007690225789779001
contact: 0.007426294254051179
emailed: 0.007293908718269971
sorry: 0.006996833413161035
40: 0.00655045635547875
eviction: 0.006395983717344581
minute: 0.006329270416015321
another: 0.006205343737116793
asking: 0.006024228566726314
worse: 0.005992759886056528
appointment: 0.005747327760490766
50: 0.005715234628420314
forbid: 0.00556048697083014
canceled: 0.004926332582322096
fantastic: 0.004785472664770513
told: 0.004395616429337232
loved: 0.004291213121441303
somebody: 0.004247760114773865
understand: 0.004242298153325464
meaty: 0.0041894004475928285
bother: 0.004068377308423213
fresh: 0.002679939375884708
asked: 0.0025147510507486136
unwelcoming: 0.0020155888806332976
nigger: 0.001822524892665566
visibly: 0.0017707496994619084
tasty: 0.0016872853300274652
caucasian: 0.0016453484211849032
breakfast: 0.001574048345578634
outdated: 0.0015378199630342623
fuck: 0.0015186032086513758
bitch: 0.00148837011158734
embarrassed: 0.001442643722918538
uncomfortable: 0.001387866188874136
thanks: 0.001372725150520641
little: 0.0012931479962975552
cancelled: 0.000228155822599848
update: 0.00022114271549712554
advising: 0.00021409426558062554
post: 0.00020943003772807618
sadly: 0.00019583391079738884
spoke: 0.00019456986953886364
abruptly: 0.0001933551205657392
spa: 0.0001894985253882692
incredible: 0.00011116878126463624
great: 7.985895862032405e-05
vibe: 6.526236216291725e-05
pleasantly: 5.69036627905405e-05
comfortable: 4.163235963622531e-05
spacious: 4.1525285053902235e-05
adventure: 4.012596976004274e-05
modern: 3.300135054514177e-05
jawn: 2.9526635762029154e-05
yum: 2.8331284732003837e-05
good: 2.3184925114363835e-05
amazing: 1.7635005716509283e-05
raspberry: 1.7576813179436042e-05
knowledgeable: 1.6582819582198083e-05
town: 1.612617566749203e-05
everything: 1.5468477397719526e-05
instagram: 1.488256309505478e-05
atmosphere: 1.3506666560818322e-05
surprised: 1.2555938975828949e-05
crowded: 8.83712393828958e-06
flavorful: 8.649691870745958e-06
selection: 8.461496170813175e-06
parking: 8.356753752182271e-06
cleanest: 7.544238597917003e-06
generous: 7.4994652648128985e-06
offer: 7.046840335037891e-06
cigar: 7.038773488300331e-06
cocktail: 6.712936497147508e-06
die: 5.555120056690995e-06
fast: 5.55428094408912e-06
place: 5.4733502525905034e-06
awesome: 5.35509153066075e-06
relaxing: 5.287059395753344e-06
mary: 4.752121221936916e-06
deli: 4.088691028288867e-06
pudding: 3.903106885514056e-06
fabulous: 3.878837322278016e-06
pleased: 3.5533282763979564e-06
helpful: 3.2070409413710247e-06
champagne: 2.972861385369738e-06
plus: 2.8087983855605457e-06
orleans: 2.5860833189833933e-06
dat: 2.3902847973843277e-06
blackened: 2.1543418090877566e-06
flavor: 2.1473652843097682e-06
nice: 1.942773005631924e-06
alex: 1.7248692126332233e-06
super: 1.6709606149150086e-06
excellent: 1.6683069094798176e-06
unassuming: 1.5538693054070544e-06
mexican: 1.4639508196247593e-06
spot: 1.4597662821806646e-06
egg: 1.4506748227265227e-06
phenomenal: 1.3950114402615038e-06
helped: 1.3832256273683992e-06
refreshing: 1.2398753885540714e-06
robust: 1.1856900899899766e-06
mint: 1.1578551072057765e-06
cafe: 1.1508643523059103e-06
relaxed: 1.1136514784111252e-06
fave: 1.1019731986630542e-06
efficient: 1.0702145753397589e-06
free: 1.0313823358175282e-06
music: 9.374314210327356e-07
accommodating: 9.257100506772497e-07
gorgeous: 9.122611149084814e-07
rachel: 8.052448168176733e-07
team: 7.846923052474428e-07
neighborhood: 7.269136167262318e-07
pistachio: 6.95156332549067e-07
pup: 6.882503013696034e-07
socialization: 6.790000994183223e-07
freshest: 6.661099633392638e-07
bbq: 5.611532803302918e-07
luna: 5.600461308591414e-07
magic: 5.423153327083264e-07
cannolis: 5.408825457194764e-07
filling: 4.659662516910441e-07
visiting: 4.6238406884429605e-07
cannoli: 4.563852961465635e-07
cute: 4.2192895536105323e-07
perfectly: 3.935404534274366e-07
robyn: 3.8955477378292144e-07
outdoor: 3.632129523116086e-07
crowd: 3.5387983698575093e-07
verdura: 3.4045833576471894e-07
rabe: 3.1512473706957837e-07
roasted: 3.1164809779896456e-07
burrata: 3.0324822878091053e-07
handmade: 2.981970334496181e-07
seeded: 2.96388963681357e-07
knowledgable: 2.9585648129625284e-07
obsession: 2.831758794197096e-07
ingredient: 2.739605741031535e-07
brunch: 2.53706119676107e-07
pop: 2.487459689359421e-07
authentic: 2.449594209161656e-07
camp: 2.247856413300624e-07
recommendation: 2.2436883117646956e-07
glad: 2.2375020145729127e-07
brandy: 2.0902365905315557e-07
taco: 2.0248868823361895e-07
crisp: 1.9440864836014732e-07
pork: 1.9077708702894763e-07
quail: 1.7121464377966933e-07
cauliflower: 1.673076209510584e-07
mature: 1.4721538514271802e-07
apple: 1.373326879332264e-07
chop: 1.334548656935044e-07
farm: 1.1869244743321804e-07
fried: 8.681013357513295e-08
twinkling: 5.582334211426152e-08
smoked: 5.243234098472865e-08
whenever: 4.615029758900991e-08
huge: 4.503704855448237e-08
opted: 4.294842509884994e-08
nestled: 3.9187881015284615e-08
special: 3.824246605112361e-08
sehr: 3.723068657824672e-08
food: 1.4829301206004597e-08
drowned: 9.22703992445554e-09
nearly: 8.769351144981087e-09
counter: 7.0341336157727915e-09
something: 4.0775458515528605e-09
joke: -4.9606549563216225e-09
starting: -1.8387173991169526e-08
health: -9.534759689811138e-08
else: -9.898902986703521e-08
brought: -1.7924489807039915e-07
go: -2.0050643141464257e-07
day: -2.0965023888534188e-07
got: -2.1017035013397833e-07
14: -2.181240648366245e-07
check: -2.1887095110952963e-07
without: -2.600556071783093e-07
ignore: -2.7996575960575456e-07
long: -3.52199367808353e-07
solely: -3.608053128530278e-07
missed: -3.671222554646579e-07
trying: -3.9712856562284974e-07
skin: -4.6217152377063516e-07
container: -5.184999604368133e-07
based: -5.592618453427498e-07
away: -5.785891095005176e-07
complete: -6.105693428263065e-07
woman: -6.950046515706071e-07
wanted: -7.977945303153912e-07
training: -8.281916050299413e-07
work: -9.396489702086812e-07
quality: -1.01736404295172e-06
80: -1.062339123593751e-06
bland: -1.1601503277891734e-06
service: -1.1898880993357882e-06
gotten: -1.2934582601973758e-06
want: -1.3549582812421482e-06
buy: -1.455752095756881e-06
cash: -1.6036252106888743e-06
kitchen: -2.208137532402526e-06
explained: -2.243821797899918e-06
customer: -2.3676043158865408e-06
toooooo: -2.433736247306593e-06
anything: -2.693799248620165e-06
10: -2.9883773734722908e-06
spoiled: -3.402050132572048e-06
door: -3.4423959682408115e-06
twice: -3.984204253859772e-06
90: -4.7032868371827575e-06
waiting: -5.321203597463813e-06
please: -6.275078499391923e-06
time: -6.62302022803796e-06
stay: -7.267163027980131e-06
spend: -8.243539877443775e-06
deal: -8.254958474007789e-06
one: -8.894031380093509e-06
dunkin: -9.25838631687393e-06
hard: -9.281816998349412e-06
hotel: -9.49097296238249e-06
better: -9.587287213617943e-06
stepped: -1.0624570754605655e-05
later: -1.1954440402291877e-05
way: -1.2785971132295523e-05
given: -1.3239094011962047e-05
nothing: -1.3781537284890898e-05
employee: -1.6413380228167497e-05
problem: -2.007802361326408e-05
pick: -2.023325091933217e-05
ordered: -2.0506317142289972e-05
kno: -2.1872518937179815e-05
take: -2.266327986847173e-05
write: -2.3272297378818022e-05
brag: -2.3364098998436573e-05
expecting: -2.4474039995143885e-05
earlier: -2.5640084318040904e-05
logical: -2.5985462421336526e-05
encountered: -2.693958662503368e-05
approved: -2.8161991959679447e-05
temperature: -3.052076968921325e-05
entirely: -3.23676209251275e-05
security: -3.6357993280583676e-05
estimated: -3.7055073230609636e-05
leave: -4.1552635951755976e-05
clearly: -4.179230352701787e-05
forgettable: -4.5761567378273315e-05
ok: -0.0003813372216789702
received: -0.0005698087456068185
worth: -0.0008168081498281124
easy: -0.0009273584374386028
single: -0.001209269132780663
basic: -0.0012224408849066401
get: -0.0012439323132689443
ago: -0.0013294467676720101
even: -0.0013575886258995215
best: -0.001370568130493848
review: -0.0014167989987409412
compassionate: -0.0014397453837893491
two: -0.0014741023927823632
homey: -0.001475653535963959
possible: -0.001531607258712582
manager: -0.0015604561228142288
complained: -0.0016249007173343214
issue: -0.0016454559511670148
turned: -0.0016724924423614225
noone: -0.0017910461787501452
give: -0.0018264747585328594
order: -0.0018409913037869734
computer: -0.001946157886845705
kick: -0.001959121352959343
desk: -0.002060553129364634
ask: -0.0024458369609634645
friendly: -0.002523725102678545
sprite: -0.002993940948672193
um: -0.0031710994397280297
wonderful: -0.004123746702448322
love: -0.004148046260369466
highly: -0.004535849430192918
recommend: -0.005086408401837509
missing: -0.005799792699334027
instead: -0.006102141275695025
always: -0.006808301014697158
posted: -0.007015931790672499
locally: -0.00834393240020323
approachable: -0.008785900064654609
dramatically: -0.010127012933660987
outstanding: -0.010185368877089162
calling: -0.010324286662243231
definitely: -0.010931056369454399
favorite: -0.011765854892737381
In [34]:
nb_predict_proba([X_test.iloc[2]])
Out[34]:
array([[2.92973824e-07, 9.99999707e-01]])
In [114]:
lstm_predict_proba
Out[114]:
array([[0.05784018]], dtype=float32)
In [120]:
X_test.shape
Out[120]:
(1257251,)
In [127]:
from lime.lime_text import LimeTextExplainer
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming X_train, y_train, X_test, y_test are your training and test data
# Assuming tokenizer is your text tokenizer
# Assuming max_sequence_length is the maximum length of your padded sequences

# Load the trained model from the h5 file
lstm_model = tf.keras.models.load_model("lstm_model1.h5")

# Choose a random instance index from the test set for explanation
idx = np.random.randint(len(X_test))
text_instance = X_test.iloc[idx]

# Tokenize and pad the text instance
text_instance_seq = tokenizer.texts_to_sequences([text_instance])
text_instance_pad = pad_sequences(text_instance_seq, maxlen=100)

# Define the predict_proba_fn function
def predict_proba_fn(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    sequences_pad = pad_sequences(sequences, maxlen=100)
    return lstm_model.predict(sequences_pad)

# Initialize LIME TextExplainer
explainer = LimeTextExplainer(class_names=['negative', 'positive'])

# Explain the prediction for the random instance
explanation = explainer.explain_instance(text_instance, predict_proba_fn, num_features=10, top_labels=1)

# Print the explanation
print('Text instance:', text_instance)
print('True label:', y_test.iloc[idx])
print('Predicted label:', np.argmax(lstm_model.predict(text_instance_pad)))
explanation.show_in_notebook(text=text_instance)
WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
157/157 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step
Text instance: dinner last night pleasantly surprised since read negative review place beforehand I got say not_clue talking burger great cake even better staff friendly supervisor even went table table making sure everyone good dining experience rarely see anymore place definitely back
True label: 1
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Predicted label: 0
In [133]:
 
Out[133]:
1
In [134]:
# Assuming X_test is your test data and y_test is the corresponding labels

# Choose 50 random indices from the test set
indices = np.random.choice(len(X_test), size=50, replace=False)

# Iterate over the selected indices and predict the sentiment for each record
for idx in indices:
    text_instance = X_test.iloc[idx]
    text_instance_seq = tokenizer.texts_to_sequences([text_instance])
    text_instance_pad = pad_sequences(text_instance_seq, maxlen=100)
    
    # Predict the sentiment
    sentiment_prob = lstm_model.predict(text_instance_pad)[0]
    predicted_sentiment = 'positive' if sentiment_prob[0] > 0.5 else 'negative'
    true_sentiment = 'positive' if y_test.iloc[idx] == 1 else 'negative'
    
    # Print the prediction
    print('Text:', text_instance)
    print('True sentiment:', true_sentiment)
    print('Predicted sentiment:', predicted_sentiment)
    print()
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step
Text: scheduled 8am called say coming 8am 9am professional courteous wanted would definitely recommend others
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step
Text: family 4 made reservation monday evening restaurant 25 full immediately seated left sitting table 17 minute without ever someone come acknowledge sat not_drink order not_water nothing many good option area spend money place show lack consideration customer
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step
Text: ambiance wonderful brass unique seat think traditionally dressed staff food average though went dinner got good assortment food dish bit lacking flavor profile lamb iskender gyro meat not_wow portion nicely sized though lot filler item plate fry fried bread meat dish made think twice price quantity overall may go back couple month operation new place perhaps grow better another turkish place louis still heart plus right across street great frozen custard place
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step
Text: well I officially clayton resident new place minute walking distance combo kaldi quickly becoming morning office not_express happy make able wake short walk away good breakfast awesome coffee free topic one complaint hour limit sure give new username password free need next hour happens middle big project wireless time run major frustration let tell not_extend little longer food enough satisfy new favorite thing order morning quiche day yesterday served breakfast potato sweet potato regular banana perfect cup cinnamon hazelnut coffee topic banana random little mom would always slice banana put bowl milk give spoon anyone else really diverse crowd morning fun people watch catching news drinking coffee actually ideal way spend every single morning chance stop see banana milk course
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
Text: anniversary dinner place great say not_hardly imagine anyone could score four star start service outstanding justin clearly take profession seriously family plenty time chat course coming neither rush slow literally right came time recommendation encouraged u provided additional information led extraordinary dining selection salmon impeccable beef even better several good restaurant fine main course eastland raised bar even higher pairing main course excellent vegetable eating enjoying part meal realized big portion wife decided order side item omg green chile mac cheese fried green tomato also superb nothing frozen not_shame u asked doggie bag meal great I sure next serving good longtime nashvillian I bit embarrassed family discovered eastland cafe food atmosphere service price made experience remember quite time
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
Text: first food overall get 5 star service little slow place not_overly busy would star could beverage pretty boy spicy watermelon margarita wonderful neither sweet pretty boy definitely stronger kick watermelon marg enough spice slight heat without making drink hot enjoy fruity flavor apps kept simple rolled chip sauce trio tres amigo salsa honestly star roast tomato wood fire oven smokey flavor definitely pull order queso guacamole well meal mom enchilada opted spinach mushroom went chorizo exceptional however agreed spinach option better lime used really pulled overall dish super light comparison expected pleasant surprise great vegetarian option know not_pictured street corn could not_wait try one favorite food devoured could snag picture personally could smidge heat bowl empty left know worth checking definitely splitting item menu still explore area drink desired
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step
Text: I traveling greenwood last year work thought starbucks coffee town today coffee shop go great menu friendly baristas polite knowledgeable reasonably priced coffee make great flat white also lot room sit work
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step
Text: I glad took drive shop search kayak done research still unsure kayak would fit need use scott helpful asking question listening need installed crossbar vehicle showed proper way strap kayak would highly recommend anyone looking purchase kayak canoe stop not_be disappointed
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: enjoyed time effie dining going kimmel center block northwest not_know byob informed like glass wine dinner wanted get full effie experience went may gone bit overboard appetizer got spanakopita far favorite table also got fried zucchini chip came amazing dipping sauce calamari pretty good came really key lemon slice marinara ended relatively full first course soldiered boyfriend got gyro platter really enjoyed meat pea think not_too appetizing overshadowed main attraction mom shared mixed grill ultimate greek platter gyro sausage lamb shank lamb kabob chicken kabob might came fry could not_eat one decent tzatziki tiniest little greek salad ever seen server sweet sure fill water glass food came quickly door plenty time theater come back I bringing wine going plan stay savor thing even bring course
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step
Text: good service excellent ordered philly chicken sandwich also ordered egg plant fry delicious
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: reached gate 1 travel hope could assist upcoming trip croatia attending wedding croatia hotel arrangement 2 night trip looking go 10 day already something arranged two night trip said sorry not_help rude never waste time reaching thanks nothing gate 1
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: went open 3 day drink pollo panni focscia bread al good went back week later ordered came hamburger bun rip ordered johnny walker black rock charged drink went back today lunch not_lunch bar menu anymore waitress said new menu two salad one beer coffee foodie dine central south jersey well philly place horrible go rt 130 little bring bottle wine fratelli super food reasonable priced avoid portico bad deal
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: new favorite sandwich place santa barbara highly coveted position really feel like sandwich worth money atmosphere fun owner hysterically funny star would highly recommend
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step
Text: looking forward trying fresh food pretty disappointed place food drink barely mediocre overall atmosphere pretty stale boring service great waiter helpful answering question went small group girl check not_separated found odd inconvenient not_be returning
True sentiment: positive
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step
Text: friend went percy day first time say food great cornbread good full slab rib delicious cocktail much needed drink hot day course try award winning pecan pie okay okay try see good mine good pie competition although pie die not_pleased cost ice cream would think would come oh delicious pecan pie digress would recommend everyone try place I back
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step
Text: love first experience ok go food always fresh meat tender delicious
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
Text: excellent walking stick look gorgeous perfect trip london
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 21ms/step
Text: would definitely give food cafe abyssinia five even star love veggie combo eaten ethiopian many time food rank among favorite restaurant homey feel almost like eating someone private residence nice sit outside unfortunate car park right next patio occasionally smell exhaust fume eat otherwise I huge fan byob nice
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
Text: forgotten local took whole fish menu truly tourist attraction miss u season
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
Text: dr lundeen work right would normal person want result surgery repair left dr lundeen not_doing surgery
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: tasty food wonderful atmosphere thing wished marquee 9 appetizer deliciously sweet pineapple avocado dip plantain chip would gone little bit everything else tried sufficient portion dinner menu includes several cuban sandwich breakfast choice recommend calle ocho sandwich pickle roast pork side house salad made fresh dressing one highlight visit handmade drink beautifully presented bar new restaurant often crowded 6 9pm definitely get reservation
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step
Text: heard place friend dropped car within hour done I detailing place place really get done high standard not_corners cut I not_do deep inspection car could tell getting car back immaculate job fair price nice people great service
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Text: worst wendy ever gone rude terrible food scared eat meal hope never visit wendy
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step
Text: saturday afternoon decided needed haircut new look not_appointment walked scout thrilled kendra not_longer cutting hair skylar young lady assigned taken photo past wanted cut not_only listen wanted spent good amount time getting right went long hair decided time get style instead wearing time happy definitely going back scout specifically ask skylar thank
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step
Text: poor quality hotel hilton not_be back level customer service could lot better shame management general manager not_respond review online kinda make wonder care hotel anyway chose another hotel not_much around hotel travel everywhere nothing see nearby
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
Text: best lentil soup I ever life must like spicy little kick come back husband town
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step
Text: priced basic seasoned food cocktail fine food limited way priced 35 2 piece chicken come want tell friend got avoid want good meal
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
Text: definitely not_have good time came late night cold could not_enjoy view staff working winery daughter bad vibe staff not_that friendly excitable wine tasting bottle understand close end night ambiance not_good not_one really check either kind hassle return bar time get next tasting hoping seat not_get jacked hopefully staff work customer service friendliness help marketing retaining customer location thing would make return
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step
Text: joshua server amazing informative eaten sushi made experience eau exciting food delicious pic best sushi I indy dessert world not_wait go back
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
Text: not_good hotel could better better service checked late got passed inquire everything bed stand employee shuttle driver michael turned around introduced offered everyone water cookie standard not_the exception
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step
Text: tour great way see different side new orleans great tour guide knew city well kept everything fun big plus bike super comfortable new orleans street highly recommend tour
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: tommy gunns look would never expect little shack ridge ave would pack huge punch barbecue flavor introduced gunns way back roommate manged place always blessed leftover deep fried mac cheese pork brisket sandwich course corn bread I sandwich pork brisket never better bbq side amazing must visit place higher end term cost worth mood
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step
Text: tried lobster ravioli nice flavor really not_lobster seemed like regular ravioli lobster flavoring little disappointing giving 4 star good service alfredo dipping sauce good breadstick
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Text: definitely best southern food I usually judge place meat green point kinda divey side murf rd lavernge well worth
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step
Text: shrimp oyster et tu fe oyster raw grilled particularly liked grilled cooked enough cook cheese top not_kill oyster cajun shrimp et tu fe outstanding overall may lack timely service may bit touristy make large portion especially side great happy hour friendly staff nice balcony
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step
Text: not_idea anything 1 start not_even describe order messed pizza cold wing gooey disgusting grab pizza costco deli heat avoid terrible experience
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step
Text: exton ups store made usual madness holiday shipping surprisingly easy delightful aside 5 minute wait cold safety prevents many customer inside time super easy young woman helped jazlyn simply best cheerful helpful sweet snuck right closing although clear slammed day friendly kind huge kudos ups store not_all go way support level customer service
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
Text: food great recommend place anyone customer service impeccable julie great service point impressed return get back new orleans
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
Text: thank joy jose maintainence helping u transition chestnut hill village apartment initial issue unit solved thorough professional manner pleased result townhouse truly becoming home seeking
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step
Text: made trek best german restaurant il good ever read previous review food consistently good menu not_changed not_need change german beer solid german cuisine prepared served really nice people reasonable price could one ask previous review accurate originally wrote least 10 time since read date food u
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step
Text: breakfast not_i would like cafe con leche sorry machine broken I philly cheesesteak ok please add mushroom sorry not_mushrooms wtf first last visit food not_come yet please pray u
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step
Text: good food simple atmosphere store layout service quick efficient food prepared well seasoned sauced good thai food without americanized
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step
Text: recently bought townhouse reno quickly realized get warmer used elected ceiling fan put every room turned trusty friend yelp requested several estimate I not_sure majority smoking recreational marijuana price lead time crazy olectric exception based positive yelp review decide give shot dale justin showed time ready work walked job pointed potential issue got guy not_weekend warrior selfers pro minimized potential issue great job budget ka ching would not_hesitate use recommend thanks olectric great job
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step
Text: iced chai tea latte not_on menu not_stop making one ask make exactly like level sweetness want milk spiciness try adjust mmmmmm drained not_time flat staff friendly helpful nothing seems range decor fun ton look inviting comfy place little way totally fit neighborhood neighboring shop great place exploring part town
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step
Text: went drive thru cappuccino think person window name bryce say energy booboo friendly
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
Text: veterinarian vet technician phenomenal seemed truly passionate job friendly gentle nervous dog took right away concern health one best vet I ever whenever know pet good hand
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step
Text: visiting nola week stopped never expecting food amazing pimento cheese biscuit biscuit amazing sausage hint spice not_too spicy also ordered peanut butter hot chocolate must chocolate lover think liquid peanut butter cup better
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
Text: crappy service one 1 person workin counter phone ur bos get get discount cause not_have authority b u employee give permission speed bit 20 min get coherent tube ridiculous
True sentiment: negative
Predicted sentiment: negative

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 18ms/step
Text: rude tsa agent make beautiful easily navigate otherwise florida airport lose full star say rude mean yell face not_listen kindly trying explain issue russian roulette security screening may nice may ruin day horribly get tsa pre save trouble aside likely one top 3 favorite airport united state america great layout great charging station everywhere even cool airport started installing decent dining option great connection international city many direct flight offered domestic destination layover connecting flight usually make sense hope not_change recent expansion renovation improving seems accommodate traffic yet retain current state affair great location close go beautiful tampa bay beach petersburg easy access downtown tampa well speaking car several rent car company claim part huge parking garage stone throw elevator terminal baggage claim got someone picking dropping not_problem first hour parking also free I always happy land depart tpa dandy dandy tsa pre unless american airline ruin wiping account asked second last name added magically disappeared account morning grin
True sentiment: positive
Predicted sentiment: positive

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 19ms/step
Text: cheap sandwich good topping choice pick bread ton bakery option like served cool unless come night kind stale picked looked like decent grocery also low priced decent sandwich good spot check
True sentiment: positive
Predicted sentiment: positive